This is a draft; the analysis is still ongoing.
This document focuses on exploring the relationship between the census variables.
library(tidyverse)
library(magrittr)
library(knitr)
library(GGally)
Load the transformed census data.
# Column layout of the transformed census file in readr's compact notation:
# 12 character ("c") columns followed by 125 double ("d") columns.
census_col_types <- paste0(strrep("c", 12), strrep("d", 125))

# Read the transformed census data with the explicit column types above.
census_data_trans <- read_csv(
  "../storage/dati-cpa_2011_all-trans-v0_0_4.csv",
  col_types = census_col_types
)
Calculate the correlation between the transformed variables to identify those that might be excluded from the analysis.
# Names of the transformed variables that are candidates for the analysis:
# every column from P1_norm_log10_std through E30_E31_norm_std.
candidate_vars <-
  census_data_trans %>%
  select(P1_norm_log10_std:E30_E31_norm_std) %>%
  colnames()

# All unordered pairs of candidate variables, in the order
# (1, 2), (1, 3), ..., (2, 3), ... Guarded because combn() raises an error
# when fewer than two names are available.
candidate_var_pairs <-
  if (length(candidate_vars) >= 2) {
    combn(candidate_vars, 2, simplify = FALSE)
  } else {
    list()
  }

# One result row per pair, preallocated so that nothing is grown in the loop.
candidate_vars_cor_rows <- vector("list", length(candidate_var_pairs))

for (pair_idx in seq_along(candidate_var_pairs)) {
  var_pair <- candidate_var_pairs[[pair_idx]]

  # A fresh 1% random sample of rows is drawn for every pair.
  # NOTE(review): no seed is set in the visible code, so the estimates differ
  # between runs, and each pair is tested on a different sample -- confirm
  # this is intended.
  census_data_trans_sample <-
    census_data_trans %>%
    slice_sample(prop = 0.01)

  # Kendall rank correlation test between the two variables of the pair.
  ij_cor_test <- cor.test(
    census_data_trans_sample[[var_pair[1]]],
    census_data_trans_sample[[var_pair[2]]],
    method = "kendall"
  )

  candidate_vars_cor_rows[[pair_idx]] <-
    tibble(
      var_i = var_pair[1],
      var_j = var_pair[2],
      estimate = as.numeric(ij_cor_test$estimate),
      p_value = as.numeric(ij_cor_test$p.value)
    )
}

# Bind all rows at once. The typed empty template keeps the four expected
# columns present even when there are no pairs to test.
candidate_vars_cor <-
  bind_rows(
    c(
      list(
        tibble(
          var_i = character(),
          var_j = character(),
          estimate = double(),
          p_value = double()
        )
      ),
      candidate_vars_cor_rows
    )
  )
Further explore the most highly correlated variables, including all correlations with coefficient above \(0.5\) (indicating a shared variability above \(25\%\), in orange in the annotated chart below) and focusing in particular on correlations with coefficient above \(0.7\) (indicating a shared variability above \(50\%\), in red in the annotated chart below).
# Thresholds used to single out significant, strong correlations.
correlations_cutoff_p_value <- 0.01
correlations_cutoff_estimate <- 0.5

# Tabulate the variable pairs whose test is significant and whose coefficient
# exceeds the cutoff.
# NOTE(review): only positive coefficients pass this filter; strong negative
# correlations are not listed -- confirm this is intended.
kable(
  filter(
    candidate_vars_cor,
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  )
)
| var_i | var_j | estimate | p_value |
|---|---|---|---|
| P1_norm_log10_std | A44_norm_log10_std | 0.9138809 | 0 |
| P1_norm_log10_std | PF1_norm_log10_std | 0.9406808 | 0 |
| P1_norm_log10_std | E1_norm_log10_std | 0.5447581 | 0 |
| P7_norm_log10_std | P29_norm_log10_std | 0.5189718 | 0 |
| P9_norm_std | P53_norm_std | 0.5028844 | 0 |
| P17_norm_log10_std | P131_norm_log10_std | 0.5795901 | 0 |
| P29_norm_log10_std | P139_norm_std | 0.5779303 | 0 |
| P33_norm_std | P132_norm_std | 0.5205444 | 0 |
| P64_norm_std | P65_norm_std | 0.8209654 | 0 |
| ST1_norm_log10_std | ST2_norm_std | 0.5786968 | 0 |
| ST1_norm_log10_std | ST3_norm_std | 0.5546127 | 0 |
| ST2_norm_std | ST3_norm_std | 0.5661384 | 0 |
| A3_norm_std | A5_A6_A7_norm_std | 0.9762379 | 0 |
| A44_norm_log10_std | PF1_norm_log10_std | 0.9270664 | 0 |
| A44_norm_log10_std | E1_norm_log10_std | 0.5687334 | 0 |
| A44_norm_log10_std | E20_norm_std | 0.5047472 | 0 |
| PF1_norm_log10_std | E1_norm_log10_std | 0.5562196 | 0 |
| PF1_norm_log10_std | E24_E25_E26_norm_std | 0.5253912 | 0 |
| PF2_norm_std | PF6_PF7_PF8_norm_std | 0.7295336 | 0 |
| E20_norm_std | E24_E25_E26_norm_std | 0.6188560 | 0 |
| E20_norm_std | E27_norm_log10_std | 0.5620777 | 0 |
| E24_E25_E26_norm_std | E27_norm_log10_std | 0.7589257 | 0 |
# Pairs that pass both cutoffs, computed once and reused for both sides of
# the pair below.
# NOTE(review): only positive coefficients pass this filter; strong negative
# correlations are not explored -- confirm this is intended.
strong_candidate_vars_cor <-
  candidate_vars_cor %>%
  filter(
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  )

# Distinct variables that appear on either side of a retained pair
# (var_i names first, then var_j names, duplicates removed).
correlations_to_explore <-
  c(
    strong_candidate_vars_cor %>% pull(var_i),
    strong_candidate_vars_cor %>% pull(var_j)
  ) %>%
  unique()

# Scatterplot matrix of the retained variables on a 1% random sample of rows:
# Kendall coefficients in the upper triangle, small translucent points in the
# lower triangle.
correlations_to_explore_panel <-
  census_data_trans %>%
  slice_sample(prop = 0.01) %>%
  # all_of() is the documented way to select columns named in an external
  # character vector; the embrace operator is meant for function arguments.
  select(all_of(correlations_to_explore)) %>%
  ggpairs(
    upper = list(continuous = wrap(ggally_cor, method = "kendall")),
    lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1))
  )

print(correlations_to_explore_panel)
# ggsave(
# "../100-prep/111-classification-variable-selection-top-correlations-v0_0_4.png",
# correlations_to_explore_panel,
# width = 900,
# height = 900,
# units = "mm",
# dpi=300
# )
The figure below is an annotated version of the plot above.